# Load the needed packages ----
# NOTE(review): rm(list = ls()) was removed here. Clearing the global
# environment inside a script is an anti-pattern: it does NOT give a fresh
# session (attached packages, options, etc. persist) and it silently deletes
# whatever objects the user had. Restart R instead.
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.1 ✔ purrr 0.3.2
## ✔ tibble 2.1.3 ✔ dplyr 0.8.2
## ✔ tidyr 0.8.3 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.4.0
## ── Conflicts ────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
# NOTE(review): library(forcats) removed — forcats 0.4.0 is already attached
# by the tidyverse call above (see the attach message).
library(ModelMetrics)
##
## Attaching package: 'ModelMetrics'
## The following object is masked from 'package:base':
##
## kappa
library(modelr)
##
## Attaching package: 'modelr'
## The following objects are masked from 'package:ModelMetrics':
##
## mae, mse, rmse
# Load the data ----
load("els.RData")

# Summarize math scores (bynels2m), the dependent variable.
# Recorded result: mean ≈ 45.4 (313 observations have missing math scores).
els %>%
  summarize(mean(bynels2m, na.rm = TRUE))

# Histogram of math scores
gg <- ggplot(els, aes(x = bynels2m)) +
  geom_histogram()
gg

# Density plot of math scores
gg <- ggplot(els, aes(x = bynels2m)) +
  geom_density()
gg
# Simple bivariate regression ----
# Outcome (math score) on the left, predictor (SES) on the right.
# Recorded fit: intercept ≈ 45.04, byses1 slope ≈ 7.95, R^2 ≈ 0.188
# (964 observations dropped for missingness).
mod1 <- lm(bynels2m ~ byses1, data = els)
summary(mod1)

# 95% confidence intervals for the coefficients
confint(mod1)

# Scatterplot of math score against SES with the fitted OLS line
g1 <- ggplot(els, aes(x = byses1, y = bynels2m)) +
  geom_point(shape = 1) +        # open-circle points
  geom_smooth(method = lm)       # linear-model smoother
g1

# Force the origin (0, 0) into the plotting region
g1 + expand_limits(x = 0, y = 0)

# Widen the axis limits explicitly
g1 + expand_limits(x = c(0, 3), y = c(10, 90))
# Plotting the lm line ----
# NOTE(review): this section originally refit mod1 and repeated summary()
# and confint() verbatim from the "simple bivariate regression" section
# above. Refitting an identical model is redundant, so only the plot —
# the point of this section — is kept; mod1 is unchanged from its first fit.
g1 <- ggplot(els, aes(x = byses1, y = bynels2m)) +  # data plus x and y
  geom_point(shape = 1) +   # open-circle points
  geom_smooth(method = lm)  # ask for the lm line
g1
# Plotting the loess line ----
# NOTE(review): this section also refit mod1 and repeated summary()/confint()
# verbatim. The only new content is the loess smoother, so only the plot is
# kept; the redundant refit was removed.
g1 <- ggplot(els, aes(x = byses1, y = bynels2m)) +  # data plus x and y
  geom_point(shape = 1) +      # open-circle points
  geom_smooth(method = loess)  # ask for the loess (local regression) line
g1
# Labeled scatterplot with the lm line ----
# NOTE(review): a third identical refit of mod1 (plus summary()/confint())
# was removed here; the model is unchanged from the first fit above.
g1 <- ggplot(els, aes(x = byses1, y = bynels2m)) +  # data plus x and y
  geom_point(shape = 1) +   # open-circle points
  geom_smooth(method = lm)  # ask for the lm line
# Relabel the axes and overlay purple diamond points
g1 + scale_x_continuous(name="SocioEconomic Status") +
  scale_y_continuous(name="Math Scores") +
  geom_point(color = "#800080", size = 1, shape = 23)
# Interpretation: there is a positive relationship between math score and
# socioeconomic status — as a test taker's SES increases, predicted math
# scores increase.
# NOTE(review): the interpretation sentence above was bare prose in the
# script (a syntax error when sourced); it is now a comment.
# Bring in a second covariate: parental education ----
# Color the SES-vs-math-score scatter by parental education level and fit
# one lm line per level. Rows with missing bypared are dropped up front.
g2 <- els %>%
  filter(!is.na(bypared)) %>%
  ggplot(aes(x = byses1, y = bynels2m,
             color = as.factor(bypared))) +      # color by parental ed
  geom_point(size = .25) +                       # small dots for readability
  theme(legend.position = "bottom") +            # legend below the panel
  ylab("Test Scores") +
  xlab("Socio Economic Status") +
  scale_color_discrete(name = "Parental Ed") +   # rename the legend
  geom_smooth(method = lm)                       # one lm line per group
g2
# Transform the SES scale (roughly -1 to 1) into a 0-100 percentile ----
els <- els %>%
  mutate(byses_p = percent_rank(byses1) * 100)

# Sanity check: the mean percentile should sit near 50 (recorded: 49.8)
els %>%
  summarize(mean(byses_p, na.rm = TRUE))
# Rebuild the colored scatterplot as g3 ----
# BUG FIX: the original assigned "g3 <- g2 + ..." twice below, which
# (a) threw away the fresh ggplot object created here, (b) discarded the
# first g3 assignment, and (c) layered duplicate geom_point/theme/scale
# layers onto the already-complete g2 — exactly why the recorded output
# showed "Scale for 'colour' is already present" and doubled warnings.
# Every step now builds on g3.
g3 <- ggplot(data = filter(els, is.na(bypared) == FALSE),
             aes(x = byses1, y = bynels2m,
                 color = as.factor(bypared)  # setting the color option
             ))
## Make the dots smaller for readability
g3 <- g3 + geom_point(size = .25)
## Move the legend below the panel
g3 <- g3 + theme(legend.position = "bottom")
g3 <- g3 + ylab("Test Scores") + xlab("Socio Economic Status")
g3 <- g3 + scale_color_discrete(name = "Parental Ed") +
  geom_smooth(method = lm)  # lm lines for the range and relationship
# NOTE(review): x is still the raw byses1 scale, not the byses_p percentile
# created just above — confirm which was intended.
g3
# Regression for prediction: math score on SES percentile + parental ed ----
# Recorded fit: intercept ≈ 34.65, byses_p slope ≈ 0.2167, and one dummy
# coefficient per parental-ed level 2-8 (level 1 is the reference).
mod3 <- lm(bynels2m ~ byses_p +
             as.factor(bypared),
           data = els)
summary(mod3)

# Predict the test score for a student at the AVERAGE SES percentile with
# parental education level 8.
# BUG FIX: the original computed 34.653393 + 0.216707 * 8, which plugs the
# parental-ed level (8) in as the SES-percentile value and drops the
# as.factor(bypared)8 coefficient entirely. The correct prediction uses
# byses_p at its mean (≈ 50) and includes the level-8 dummy; predict()
# handles both and stays correct if the model is refit.
predict(mod3,
        newdata = data.frame(byses_p = mean(els$byses_p, na.rm = TRUE),
                             bypared = 8))

# Root mean squared error of mod3 over the data (recorded: ≈ 12.28).
# modelr:: prefix kept deliberately — ModelMetrics masks rmse().
rmse_3 <- modelr::rmse(mod3, els)
rmse_3